import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline
# Read the diabetes dataset from the working directory and preview the
# first five rows.
data = pd.read_csv(filepath_or_buffer="diabetes2.csv")
data.head(n=5)
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
# Keep an independent snapshot of the loaded frame. A plain assignment would
# only bind a second name to the same object, so any later in-place change to
# `data` would show up in `data_temp` as well.
data_temp = data.copy()
# Print the column dtypes and non-null counts.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 768 entries, 0 to 767 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Pregnancies 768 non-null int64 1 Glucose 768 non-null int64 2 BloodPressure 768 non-null int64 3 SkinThickness 768 non-null int64 4 Insulin 768 non-null int64 5 BMI 768 non-null float64 6 DiabetesPedigreeFunction 768 non-null float64 7 Age 768 non-null int64 8 Outcome 768 non-null int64 dtypes: float64(2), int64(7) memory usage: 54.1 KB
# (number of rows, number of columns) of the loaded frame.
data.shape
(768, 9)
# Number of rows for each distinct value of the Outcome column.
data.Outcome.value_counts()
0 500 1 268 Name: Outcome, dtype: int64
# Per-column count of missing (null) entries.
data.isnull().sum(axis=0)
Pregnancies 0 Glucose 0 BloodPressure 0 SkinThickness 0 Insulin 0 BMI 0 DiabetesPedigreeFunction 0 Age 0 Outcome 0 dtype: int64
# Bar chart of how many rows carry each Outcome value.
sns.catplot(data=data, x="Outcome", kind="count")
plt.show()
# Visualization: number of rows per Age, split by Outcome.
ax = sns.catplot(
    data=data,
    x="Age",
    hue="Outcome",
    kind="count",
    palette="pastel",
    legend=False,
)
# Enlarge the underlying figure so the many Age categories stay readable.
ax.fig.set_figheight(10)
ax.fig.set_figwidth(20)
# The built-in legend is disabled above and replaced with readable labels.
# NOTE(review): presumably the first label maps to Outcome 0 and the second
# to Outcome 1 -- confirm against the rendered plot.
plt.legend(labels=["Non diabetic", "Diabetic"], loc="upper right")
plt.show()
# Age distribution for rows with Outcome == 0, with a box plot in the margin.
# The frame is filtered first and the column is referenced by name, so the
# data frame handed to plotly and the plotted column always have the same
# length.
fig = px.histogram(
    data[data["Outcome"] == 0],
    x="Age",
    marginal="box",
    color_discrete_sequence=["lightgreen"],
)
fig.show()
# Age distribution for rows with Outcome == 1, with a box plot in the margin.
# The frame is filtered first and the column is referenced by name, so the
# data frame handed to plotly and the plotted column always have the same
# length.
fig = px.histogram(
    data[data["Outcome"] == 1],
    x="Age",
    marginal="box",
    color_discrete_sequence=["red"],
)
fig.show()
# Mean Glucose value over the rows whose Outcome is 1.
data.loc[data["Outcome"] == 1, "Glucose"].mean()
141.25746268656715
from sklearn.model_selection import train_test_split

# Features are every column except Outcome; the target is Outcome as a
# plain NumPy array.
x = data.drop(columns=["Outcome"])
y = data["Outcome"].values

# Hold out 33% of the rows for testing; the fixed seed keeps the split
# reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.33, random_state=123
)

# Report the shape of each part of the split.
for _label, _part in (
    ("xtrain", xtrain),
    ("ytrain", ytrain),
    ("xtest", xtest),
    ("ytest", ytest),
):
    print(_label, _part.shape)
xtrain (514, 8) ytrain (514,) xtest (254, 8) ytest (254,)
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier using the liblinear solver, capped at
# 1000 iterations.
model = LogisticRegression(solver='liblinear', max_iter = 1000)
# Fit the classifier on the training split.
model.fit(xtrain,ytrain)
LogisticRegression(max_iter=1000, solver='liblinear')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(max_iter=1000, solver='liblinear')
from sklearn.metrics import confusion_matrix

# Despite its name, `xpred` holds the labels predicted for the training rows.
xpred = model.predict(xtrain)
# Confusion matrix on the training split: actual labels first, predictions
# second.
confusion_matrix(y_true=ytrain, y_pred=xpred)
array([[312, 31],
[ 90, 81]], dtype=int64)
# Accuracy on the training split, kept in `score`.
from sklearn.metrics import accuracy_score

score = accuracy_score(y_true=ytrain, y_pred=xpred)
score
0.7645914396887159
# Predict labels for the held-out test rows.
ypred = model.predict(xtest)
# confusion_matrix expects (y_true, y_pred); passing them the other way round
# transposes the matrix and swaps false positives with false negatives.
confusion_matrix(ytest, ypred)
array([[143, 14],
[ 38, 59]], dtype=int64)
# Accuracy on the held-out test split (actual labels first, predictions second).
accuracy_score(y_true=ytest, y_pred=ypred)
0.7952755905511811
# Confusion matrix of the test predictions (actual labels first).
cm1 = confusion_matrix(ytest, ypred)
# The heatmap shows test-set results, so the title must report the test
# accuracy rather than the training accuracy stored in `score`.
test_score = accuracy_score(ytest, ypred)
sns.heatmap(cm1, annot=True, fmt=".0f")
plt.xlabel("Predicted Values")
plt.ylabel("Actual Values")
plt.title("Accuracy Score:{0}".format(test_score), size=15)
plt.show()
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_curve,
    roc_auc_score,
)

# Per-class precision, recall and F1 for the test predictions.
print(classification_report(y_true=ytest, y_pred=ypred))
precision recall f1-score support
0 0.79 0.91 0.85 157
1 0.81 0.61 0.69 97
accuracy 0.80 254
macro avg 0.80 0.76 0.77 254
weighted avg 0.80 0.80 0.79 254
# One hand-written sample, listed in the same order as the column names below.
# NOTE(review): with this ordering BloodPressure is 33.7 and BMI is 74; those
# two values look swapped -- confirm the intended sample before relying on
# the prediction.
test_data = [
    [0, 150, 33.7, 50, 150, 74, 0.5, 53],
]
testData = pd.DataFrame(
    data=test_data,
    columns=[
        'Pregnancies',
        'Glucose',
        'BloodPressure',
        'SkinThickness',
        'Insulin',
        'BMI',
        'DiabetesPedigreeFunction',
        'Age',
    ],
)
# Predicted class label for that single row.
result = model.predict(testData)
result[0]
1
import joblib

# Persist the fitted model to "diabetics.pkl" in the working directory.
joblib.dump(value=model, filename="diabetics.pkl")
['diabetics.pkl']